{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "UfYIQNU1WH_H"
   },
   "source": [
    "## Mount drive and set current directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 20585,
     "status": "ok",
     "timestamp": 1615418966011,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "JFf-AvwE0llf",
    "outputId": "a4161083-6646-4d26-f1de-805e1de8d9bc"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "E:\\Dropbox\\Optimal Training Sets\\Replication File v4\n"
     ]
    }
   ],
   "source": [
    "# Importing the required libraries\n",
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import random\n",
    "os.chdir(\"..\")\n",
    "print(os.getcwd())\n",
    "\n",
    "# Set the random seed\n",
    "random.seed(10012)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "dKdmHi04fnNC"
   },
   "source": [
    "## Load train and test sets from files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "Ia2NlV1BAHHP"
   },
   "outputs": [],
   "source": [
    "dataset_name = 'eo_'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "ElQOwp0KP0nW"
   },
   "outputs": [],
   "source": [
    "#Reading in the datasets\n",
    "#We are using the pd.read_csv function to read in the dataset and set which columns to be used as the index of the dataframe\n",
    "data_full = pd.read_csv(\"data/raw/\" + dataset_name+'clean_full.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "KI5P6zvFP86u"
   },
   "source": [
    "## Count Vectors and TF-IDF Vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 25441,
     "status": "ok",
     "timestamp": 1615418130893,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "NRXIWszNfqjz",
    "outputId": "37b1bef0-122d-436c-eb80-c527e810971e"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10218, 8077)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Importing in the required libraries\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "\n",
    "stp_wrds = ['a', 'an', 'the', 'of', 'and', 'but', 'or', 'of', 'to']\n",
    "# We are using custom stop words here as stp_words which we have defined above\n",
    "pipe = Pipeline([('vect', CountVectorizer(stop_words=stp_wrds, ngram_range=(1, 3))), \\\n",
    "                 ('selector', VarianceThreshold(threshold=0.028))])\n",
    "# fit_transform calls both fit() and transform() on the same data. This is used on the training data so that we can scale\n",
    "# the training data and learn the scaling parameters.\n",
    "cvec_full = pipe.fit_transform(data_full['text'], data_full['label'])\n",
    "\n",
    "# This is used for pre-processing before modelling\n",
    "#cvec_test = pipe.transform(data_test['text'])\n",
    "\n",
    "cvec_full.shape#, cvec_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "id": "G2kiRQgRxUX9"
   },
   "outputs": [],
   "source": [
    "temp = pd.DataFrame(cvec_full.toarray())\n",
    "temp['label'] = data_full['label'].tolist()\n",
    "temp.to_csv(\"data/output/\" + dataset_name+'cvec_full.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 34468,
     "status": "ok",
     "timestamp": 1615418184489,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "rQrx4gBiMaDN",
    "outputId": "6a8b8e7b-84b7-4294-ac24-e0d4e637b3ab"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10218, 7932)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stp_wrds = ['a', 'an', 'the', 'of', 'and', 'but', 'or', 'of', 'to']\n",
    "pipe = Pipeline([('vect', TfidfVectorizer(stop_words=stp_wrds, ngram_range=(1, 3))), \\\n",
    "                 ('selector', VarianceThreshold(threshold=0.00001))])\n",
    "tfidf_full = pipe.fit_transform(data_full['text'], data_full['label'])\n",
    "tfidf_full.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "id": "MI3-KUAnNBkc"
   },
   "outputs": [],
   "source": [
    "temp = pd.DataFrame(tfidf_full.toarray())\n",
    "temp['label'] = data_full['label'].tolist()\n",
    "temp.to_csv(\"data/output/\"+dataset_name+'tfidf_full.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "id": "NRDqBHMU2bCM"
   },
   "outputs": [],
   "source": [
    "del temp"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "JnipLh-qayco"
   },
   "source": [
    "## Sentence/Doc-Level Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "id": "0VEj4OuB58Et"
   },
   "outputs": [],
   "source": [
    "#from bert_embedding import BertEmbedding\n",
    "#bert_embedding = BertEmbedding(model='bert_12_768_12', dataset_name='book_corpus_wiki_en_uncased')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XQntuWYk4Pwk"
   },
   "source": [
    "### BERT-based"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "executionInfo": {
     "elapsed": 8904,
     "status": "ok",
     "timestamp": 1615418337856,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "s8p37An04Oov",
    "outputId": "88623027-1af8-4e56-d548-03dbcc449f5f"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nroberta-base-nli-stsb-mean-tokens\\nbert-base-nli-stsb-mean-tokens\\ndistilroberta-base-paraphrase-v1\\n'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "'''\n",
    "roberta-base-nli-stsb-mean-tokens\n",
    "bert-base-nli-stsb-mean-tokens\n",
    "distilroberta-base-paraphrase-v1\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 119984,
     "status": "ok",
     "timestamp": 1615418450395,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "r4oSC3vQ3Sll",
    "outputId": "20edb845-f228-48f5-fdaa-f1192c837c38"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10218, 768)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sbert_model = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')\n",
    "sentence_embeddings = sbert_model.encode(data_full['text'].tolist())\n",
    "sentence_embeddings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "id": "tPkBpZx2O6-c"
   },
   "outputs": [],
   "source": [
    "pd.DataFrame(sentence_embeddings).to_csv(\"data/output/\"+dataset_name+\"roberta_full.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 103645,
     "status": "ok",
     "timestamp": 1615418562377,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "jMUGivkSn4BC",
    "outputId": "70212647-120e-4962-832b-d6aff8e8cfbe"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10218, 768)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sbert_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')\n",
    "sentence_embeddings = sbert_model.encode(data_full['text'].tolist())\n",
    "sentence_embeddings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "id": "tgMJTDYVoRTc"
   },
   "outputs": [],
   "source": [
    "pd.DataFrame(sentence_embeddings).to_csv(\"data/output/\"+dataset_name+\"bert_full.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 171944,
     "status": "ok",
     "timestamp": 1615418630703,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "9BKMOGYYpP7d",
    "outputId": "0ec9dd78-70b6-468d-909e-6697f3d49523"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10218, 768)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sbert_model = SentenceTransformer('distilroberta-base-paraphrase-v1')\n",
    "sentence_embeddings = sbert_model.encode(data_full['text'].tolist())\n",
    "sentence_embeddings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "id": "TQyERo9epQFC"
   },
   "outputs": [],
   "source": [
    "pd.DataFrame(sentence_embeddings).to_csv(\"data/output/\"+dataset_name+\"distil_full.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "xigoz_mvOrZ0"
   },
   "source": [
    "### GloVe Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 279738,
     "status": "ok",
     "timestamp": 1606657831774,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Ggtud541LE-7b_PbBmTGtGkNn9nRFwEQ3keJswI6Q=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "FXts_UZ2qkj-",
    "outputId": "e9ccf3da-125f-4d25-ecac-9ef8f354a5f2"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nsbert_model = SentenceTransformer(\\'average_word_embeddings_glove.840B.300d\\')\\nsentence_embeddings = sbert_model.encode(data_train[\\'text\\'].tolist())\\nsentence_embeddings.shape\\n\\npd.DataFrame(sentence_embeddings).to_csv(dataset_name+\"glove840B_train.csv\")\\n'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "sbert_model = SentenceTransformer('average_word_embeddings_glove.840B.300d')\n",
    "sentence_embeddings = sbert_model.encode(data_train['text'].tolist())\n",
    "sentence_embeddings.shape\n",
    "\n",
    "pd.DataFrame(sentence_embeddings).to_csv(dataset_name+\"glove840B_train.csv\")\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 28119,
     "status": "ok",
     "timestamp": 1615418667272,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "CJbThfZUrSWv",
    "outputId": "57f8fd00-7f9e-47ed-b3e1-711062c5c702"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10218, 300)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sbert_model = SentenceTransformer('average_word_embeddings_glove.6B.300d')\n",
    "sentence_embeddings = sbert_model.encode(data_full['text'].tolist())\n",
    "sentence_embeddings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "id": "c2eOKp3WrWCh"
   },
   "outputs": [],
   "source": [
    "pd.DataFrame(sentence_embeddings).to_csv(\"data/output/\"+dataset_name+\"glove6B_full.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "5v40A5g1UU9a"
   },
   "source": [
    "### Universal Sentence Encoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "id": "KDpd0Rv6Uj6U"
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_hub as hub\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 83276,
     "status": "ok",
     "timestamp": 1615418838503,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "frD_QL3aUkG0",
    "outputId": "8128f418-49e4-4a17-8289-c82ea3cfb4b5"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "module https://tfhub.dev/google/universal-sentence-encoder/4 loaded\n"
     ]
    }
   ],
   "source": [
    "module_url = \"https://tfhub.dev/google/universal-sentence-encoder/4\" \n",
    "model = hub.load(module_url)\n",
    "print (\"module %s loaded\" % module_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 245,
     "status": "ok",
     "timestamp": 1615418838788,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "emz6rGIhx5Vs",
    "outputId": "a6f7357b-8d0c-4541-afdd-a6a1f63b54f4"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10218"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence_list = data_full['text'].tolist()\n",
    "len(sentence_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 57174,
     "status": "ok",
     "timestamp": 1615418895735,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 300
    },
    "id": "CNZ7LLerUpwD",
    "outputId": "f2b40ee8-3e74-4f4b-eb45-bb206e8b3b01"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10218, 512)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence_embeddings = []\n",
    "for i in range(len(sentence_list)):\n",
    "  sentence_embeddings.append(np.array(model([sentence_list[i]])[0]))\n",
    "np.array(sentence_embeddings).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "id": "NEd89D7C1uy4"
   },
   "outputs": [],
   "source": [
    "pd.DataFrame(np.array(sentence_embeddings)).to_csv(\"data/output/\"+dataset_name+\"universal_full.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "uXgIrJZro_aZ"
   },
   "source": [
    "## Dimension Reduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "id": "ijYznRILo4_w"
   },
   "outputs": [],
   "source": [
    "cvec_full = pd.read_csv(\"data/output/\"+dataset_name+'cvec_full.csv', index_col=0)\n",
    "tfidf_full =  pd.read_csv(\"data/output/\"+dataset_name+'tfidf_full.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "65wWu7EGpGgb"
   },
   "source": [
    "### PCA, UMAP, NMF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "id": "ot8pWUwUlTcS"
   },
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "pd.DataFrame(PCA(n_components=16).fit_transform(cvec_full.drop(columns=['label']))).to_csv(\"data/output/\"+dataset_name+'cvec_pca16_full.csv')\n",
    "pd.DataFrame(PCA(n_components=16).fit_transform(tfidf_full.drop(columns=['label']))).to_csv(\"data/output/\"+dataset_name+'tfidf_pca16_full.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "id": "Obx7o1onmxOK"
   },
   "outputs": [],
   "source": [
    "import umap.umap_ as umap\n",
    "pd.DataFrame(umap.UMAP(n_components=16).fit_transform(cvec_full.drop(columns=['label']))).to_csv(\"data/output/\"+dataset_name+'cvec_umap16_full.csv')\n",
    "pd.DataFrame(umap.UMAP(n_components=16).fit_transform(tfidf_full.drop(columns=['label']))).to_csv(\"data/output/\"+dataset_name+'tfidf_umap16_full.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "id": "Qf-hFdxQqCpa"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\ak8096\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\decomposition\\_nmf.py:1692: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from sklearn.decomposition import NMF\n",
    "pd.DataFrame(NMF(n_components=16, init='nndsvd').fit_transform(cvec_full.drop(columns=['label']))).to_csv(\"data/output/\"+dataset_name+'cvec_nmf16_full.csv')\n",
    "pd.DataFrame(NMF(n_components=16, init='nndsvd').fit_transform(tfidf_full.drop(columns=['label']))).to_csv(\"data/output/\"+dataset_name+'tfidf_nmf16_full.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\ak8096\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\manifold\\_t_sne.py:805: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.\n",
      "  warnings.warn(\n",
      "C:\\Users\\ak8096\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\manifold\\_t_sne.py:805: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from sklearn.manifold import TSNE\n",
    "pd.DataFrame(TSNE(n_components=2, init='random').fit_transform(cvec_full.drop(columns=['label']))).to_csv(\"data/output/\"+dataset_name+'cvec_tsne16_full.csv')\n",
    "pd.DataFrame(TSNE(n_components=2, init='random').fit_transform(tfidf_full.drop(columns=['label']))).to_csv(\"data/output/\"+dataset_name+'tfidf_tsne16_full.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "zFnpz8-5TlzF"
   },
   "source": [
    "* BERT\n",
    "* DistilBERT\n",
    "* RoBERTa\n",
    "* Universal Sentence Encoder\n",
    "* Glove6B\n",
    "* pca16-cvec\n",
    "* pca16-tfidf\n",
    "* umap16-cvec\n",
    "* umap16-tfidf\n",
    "* nmf16-cvec\n",
    "* nmf16-tfidf\n",
    "* tsne16-cvec\n",
    "* tsne16-tfidf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Topic Modeling with LDA (Taddy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import LatentDirichletAllocation\n",
    "pd.DataFrame(LatentDirichletAllocation(n_components=100, random_state=0).fit_transform(cvec_full.drop(columns=['label']))).to_csv(\"data/output/\"+dataset_name+'lda100_full.csv')"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "authorship_tag": "ABX9TyPo6o/CapDe4ia7PetNx520",
   "collapsed_sections": [],
   "name": "Embeddings.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
